Calculate time effect for timecourse of abdo and glut samples
Adding day15 floating and bulk samples to get DE stats on the
combined timecourse effect (with floating and bulk separate??).
NB: I added robust=TRUE to the eBayes calls on 27/3/20, but the
files haven’t been regenerated. So robust=FALSE there.
(with fractionally assigned multimapping alignments). Then format the table.
ous_dir = "C:/Users/sarahhp/OneDrive - Universitetet i Oslo/OUS"
d13_file = file.path(ous_dir, "D13rnaseqDec19/02featureCounts/d13_rnaseq.multim.counts")
d13_tab = read.delim(d13_file, skip=1, header=T, stringsAsFactors = FALSE)
colnames(d13_tab) = gsub("output.01hisat.","", gsub(".sorted.bam","",colnames(d13_tab)))
#head(d13_tab) #table is messy so I won't print it here
Load file with sample info and match sample information to columns in FC table
info_file = file.path(ous_dir,"D13rnaseqDec19/sample_info.csv")
d13_info = read.delim(info_file, header=TRUE, stringsAsFactors = FALSE, sep = ",")
#head(d13_info)
d13_info$sample_id = gsub("-",".", d13_info$Sequencing_name)
#remove unnecessary columns from d13_info
names(d13_info)[names(d13_info)=='Additional..fill.in.2.3.digits.code..max.5..construct.etc..'] = 'donor'
names(d13_info)[names(d13_info)=='Replicate..fill.in.number.'] = 'biorep'
names(d13_info)[names(d13_info)=='timept'] = 'time'
d13_info = d13_info[c('sample_id','time','donor','biorep','Tube.Label')]
#Checked that FC table and d13_info are in the same order
#colnames(d13_tab)
#d13_info$sample_id
And generate sample information from the bam file names
d7_file = file.path(ous_dir,"SeptRNAseq/02featureCounts/mar19_rnaseq.multim.counts")
d7_tab = read.delim(d7_file, skip=1, header=T, stringsAsFactors = FALSE)
colnames(d7_tab) = gsub("output.01hisat.","", gsub(".sorted.bam","",colnames(d7_tab)))
head(d7_tab)
## Geneid Chr
## 1 ENSG00000223972 1;1;1;1;1;1;1;1;1
## 2 ENSG00000227232 1;1;1;1;1;1;1;1;1;1;1
## 3 ENSG00000278267 1
## 4 ENSG00000243485 1;1;1;1;1
## 5 ENSG00000284332 1
## 6 ENSG00000237613 1;1;1;1;1
## Start
## 1 11869;12010;12179;12613;12613;12975;13221;13221;13453
## 2 14404;15005;15796;16607;16858;17233;17606;17915;18268;24738;29534
## 3 17369
## 4 29554;30267;30564;30976;30976
## 5 30366
## 6 34554;35245;35277;35721;35721
## End
## 1 12227;12057;12227;12721;12697;13052;13374;14409;13670
## 2 14501;15038;15947;16765;17055;17368;17742;18061;18366;24891;29570
## 3 17436
## 4 30039;30667;30667;31109;31097
## 5 30503
## 6 35174;35481;35481;36073;36081
## Strand Length 1.KD4NT_S1 10.P6D3_S10 11.P6D9_S11 12.KD3NT_S12
## 1 +;+;+;+;+;+;+;+;+ 1735 1.51 3.06 8.41 4.82
## 2 -;-;-;-;-;-;-;-;-;-;- 1351 187.13 223.71 171.04 235.24
## 3 - 68 0.20 4.70 2.03 6.18
## 4 +;+;+;+;+ 1021 0.20 1.14 0.00 1.03
## 5 + 138 0.00 0.00 0.00 0.00
## 6 -;-;-;-;- 1219 37.19 42.05 25.39 25.61
## 13.P8D0_S13 14.P8D1_S14 15.P8D3_S15 16.P8D9_S16 2.KD5NT_S2 3.P5D0_S3
## 1 3.00 3.34 2.58 3.08 2.87 2.41
## 2 287.59 243.51 187.14 138.19 96.13 189.13
## 3 2.98 2.95 1.86 2.86 0.45 0.17
## 4 1.48 0.70 0.00 0.00 0.00 0.43
## 5 0.00 0.00 0.00 0.00 0.00 0.00
## 6 69.98 55.23 33.27 19.05 26.12 25.11
## 4.P5D1_S4 5.P5D3_S5 6.P5D9_S6 7.KD6NT_S7 8.P6D0_S8 9.P6D1_S9
## 1 1.58 1.18 1.65 2.75 0.67 2.50
## 2 159.41 218.31 135.33 106.75 93.00 143.30
## 3 1.68 1.07 0.48 0.67 0.00 1.37
## 4 0.25 0.52 0.00 0.00 1.22 0.54
## 5 0.00 0.00 0.00 0.00 0.00 0.00
## 6 32.61 33.20 24.30 16.86 30.08 27.54
#Remove the file system artefacts from the count column names to create a list of the libraries:
sample_id = grep("[[:digit:]]",colnames(d7_tab), value = T)
sample_id
## [1] "1.KD4NT_S1" "10.P6D3_S10" "11.P6D9_S11" "12.KD3NT_S12" "13.P8D0_S13"
## [6] "14.P8D1_S14" "15.P8D3_S15" "16.P8D9_S16" "2.KD5NT_S2" "3.P5D0_S3"
## [11] "4.P5D1_S4" "5.P5D3_S5" "6.P5D9_S6" "7.KD6NT_S7" "8.P6D0_S8"
## [16] "9.P6D1_S9"
#Next create a sample table with information about samples and time points taken from the filenames.
Tube.Label = gsub("\\..*_S[[:digit:]]+", "", sample_id)
time = gsub(".*(KD|P)[43586]D?", "", gsub("_S.*","",sample_id))
time = factor(gsub("NT","-2",time)) #match factors
biorep = gsub(".*\\.(KD|P)", "P", gsub("(D[0139]|NT)_S.*","",sample_id))
biorep =gsub("P3","P8",biorep) #fix biorep error
donor = rep("D07-G", 16)
d7_info = data.frame(sample_id,time,donor,biorep,Tube.Label)
d7_info
## sample_id time donor biorep Tube.Label
## 1 1.KD4NT_S1 -2 D07-G P4 1
## 2 10.P6D3_S10 3 D07-G P6 10
## 3 11.P6D9_S11 9 D07-G P6 11
## 4 12.KD3NT_S12 -2 D07-G P8 12
## 5 13.P8D0_S13 0 D07-G P8 13
## 6 14.P8D1_S14 1 D07-G P8 14
## 7 15.P8D3_S15 3 D07-G P8 15
## 8 16.P8D9_S16 9 D07-G P8 16
## 9 2.KD5NT_S2 -2 D07-G P5 2
## 10 3.P5D0_S3 0 D07-G P5 3
## 11 4.P5D1_S4 1 D07-G P5 4
## 12 5.P5D3_S5 3 D07-G P5 5
## 13 6.P5D9_S6 9 D07-G P5 6
## 14 7.KD6NT_S7 -2 D07-G P6 7
## 15 8.P6D0_S8 0 D07-G P6 8
## 16 9.P6D1_S9 1 D07-G P6 9
FYI this file includes RNAseq from a different project (APEXseq); we’ll filter that out later
d15_file = file.path(ous_dir, "Feb22_RNAseq/02featureCounts/late_adipo_feb22.counts")
day15_tab = read.delim(d15_file, skip=1,header=T, stringsAsFactors = F)
colnames(day15_tab) = gsub("output.01hisat.","", gsub(".sorted.bam","",colnames(day15_tab)))
colnames(day15_tab)
## [1] "Geneid" "Chr" "Start" "End" "Strand"
## [6] "Length" "1.21405_S22" "2.21406_S30" "3.21407_S38" "4.21408_S45"
## [11] "5.21409_S52" "6.21410_S1" "7.21411_S8" "8.21412_S15" "9.21413_S23"
## [16] "10.21414_S31" "11.21415_S39" "12.21416_S46" "13.21417_S53" "14.21418_S2"
## [21] "15.21419_S9" "16.21420_S16" "17.21421_S24" "18.21422_S32" "19.21423_S40"
## [26] "20.21424_S47" "21.21425_S54" "22.21426_S3" "23.21427_S10" "24.21428_S17"
day15_info = read.delim(file.path(ous_dir, "Feb22_RNAseq/late_adipo_sample_info.txt"),
header=T)
day15_info$sample_id = gsub("-",".", day15_info$sample_id)
day15_info$time = factor(gsub("D","", day15_info$time))
day15_info$donor = gsub("D07G","D07-G", gsub("D13A","D13-A", day15_info$donor))
colnames(day15_info) = gsub("rep", "biorep", colnames(day15_info))
#use cell_type column as experiment delimiter
colnames(day15_info) = gsub("cell_type", "exp", colnames(day15_info))
day15_info = day15_info[!(colnames(day15_info) %in% c("bio.condition","researcher"))]
colnames(day15_info); day15_info$sample_id
## [1] "time" "biorep" "exp" "construct" "donor"
## [6] "separation" "sample_id"
## [1] "1.21405_S22" "2.21406_S30" "3.21407_S38" "4.21408_S45" "5.21409_S52"
## [6] "6.21410_S1" "7.21411_S8" "8.21412_S15" "9.21413_S23" "10.21414_S31"
## [11] "11.21415_S39" "12.21416_S46" "13.21417_S53" "14.21418_S2" "15.21419_S9"
## [16] "16.21420_S16" "17.21421_S24" "18.21422_S32" "19.21423_S40" "20.21424_S47"
## [21] "21.21425_S54" "22.21426_S3" "23.21427_S10" "24.21428_S17"
Edits to day15 file to fit: - remove D from time; add dash to donor - change column names: rep to biorep; cell_type to exp - remove researcher, bio.condition
Merging the fc tables directly
early = merge(d13_tab, d7_tab)
colnames(early); dim(early)
## [1] "Geneid" "Chr" "Start" "End"
## [5] "Strand" "Length" "6.19190_S20" "7.19191_S1"
## [9] "8.19192_S3" "9.19193_S6" "10.19194_S9" "11.19195_S12"
## [13] "12.19196_S15" "13.19197_S18" "14.19198_S21" "15.19199_S2"
## [17] "1.19200_S5" "2.19201_S8" "3.19202_S11" "4.19203_S14"
## [21] "5.19204_S17" "16.19205_S4" "17.19206_S7" "18.19207_S10"
## [25] "19.19208_S13" "20.S2.KD5NT_S16" "21.S7.KD6NT_S19" "22.S12.KD3NT_S22"
## [29] "1.KD4NT_S1" "10.P6D3_S10" "11.P6D9_S11" "12.KD3NT_S12"
## [33] "13.P8D0_S13" "14.P8D1_S14" "15.P8D3_S15" "16.P8D9_S16"
## [37] "2.KD5NT_S2" "3.P5D0_S3" "4.P5D1_S4" "5.P5D3_S5"
## [41] "6.P5D9_S6" "7.KD6NT_S7" "8.P6D0_S8" "9.P6D1_S9"
## [1] 58735 44
#Add experiment separator
d13_info$exp = 2
d7_info$exp = 1
colnames(d13_info)
## [1] "sample_id" "time" "donor" "biorep" "Tube.Label"
## [6] "exp"
colnames(d7_info)
## [1] "sample_id" "time" "donor" "biorep" "Tube.Label"
## [6] "exp"
early_info = rbind(d13_info, d7_info)
early_info$construct = "native"
early_info$separation = "bulk"
early_info = early_info[colnames(early_info) != "Tube.Label"]
table(paste(early_info$time, early_info$donor, early_info$exp))
##
## -2 D07-G 1 -2 D07-G 2 -2 D13-A 2 0 D07-G 1 0 D13-A 2 1 D07-G 1 1 D13-A 2
## 4 3 3 3 3 3 3
## 15 D07-G 2 15 D13-A 2 3 D07-G 1 3 D13-A 2 9 D07-G 1 9 D13-A 2
## 2 2 3 3 3 3
Merge sample information for D7 and d13; remove tube.label column and add columns construct=native and separation=bulk. Also add exp column; since d7 and d13 have overlapping samples
Merging in day15 now
whole = merge(early, day15_tab)
dim(whole)
## [1] 58735 68
whole_info = rbind(early_info, day15_info)
str(whole_info); tail(whole_info)
## 'data.frame': 62 obs. of 7 variables:
## $ sample_id : chr "6.19190_S20" "7.19191_S1" "8.19192_S3" "9.19193_S6" ...
## $ time : chr "-2" "0" "1" "3" ...
## $ donor : chr "D13-A" "D13-A" "D13-A" "D13-A" ...
## $ biorep : chr "7" "7" "7" "7" ...
## $ exp : chr "2" "2" "2" "2" ...
## $ construct : chr "native" "native" "native" "native" ...
## $ separation: chr "bulk" "bulk" "bulk" "bulk" ...
## sample_id time donor biorep exp construct separation
## 57 19.21423_S40 15 D07-G P7 native native bulk
## 58 20.21424_S47 15 D07-G P8_2 native native bulk
## 59 21.21425_S54 15 D07-G P8_1 native native bulk
## 60 22.21426_S3 15 D13-A P7 native native bulk
## 61 23.21427_S10 15 D13-A P6 native native bulk
## 62 24.21428_S17 15 D13-A P7 native native bulk
Save combined read counts so we don’t need to do this again.
write.table(whole, file.path(ous_dir, "Feb22_RNAseq/03limma/late_adipo_and_D7&D13_native_rnaseq.counts"), sep="\t", row.names = F)
write.table(whole_info, file.path(ous_dir, "Feb22_RNAseq/03limma/late_adipo_and_D7&D13_native_rnaseq_info.tab"), sep="\t", row.names = F)
remove additional samples.
whole_info$time.donor = paste(whole_info$time, whole_info$donor, sep=".")
whole_info$group = paste(whole_info$time.donor, whole_info$separation, sep=".")
whole_ob = DGEList(counts=data.matrix(whole[grep("[[:digit:]]", colnames(whole))]),
genes= whole[c("Geneid", "Length")], samples = whole_info)
rownames(whole_ob$counts) = whole_ob$genes$Geneid
summary (whole_ob)
## Length Class Mode
## counts 3641570 -none- numeric
## samples 11 data.frame list
## genes 2 data.frame list
plotMDS(whole_ob, labels = whole_ob$samples$group, gene.selection = "common")
plotMDS(whole_ob, labels = whole_ob$samples$exp, gene.selection = "common")
#Remove uncultured mature adipocytes from tissue extract
series_ob = whole_ob[,!(whole_ob$samples$exp == 2 & whole_ob$samples$time == 15)]
table(paste(series_ob$samples$group, whole_ob$samples$exp))
##
## -2.D07-G.bulk 1 -2.D07-G.bulk 2 -2.D13-A.bulk 2
## 2 5 3
## -2.D13-A.bulk native 0.D07-G.bulk 1 0.D13-A.bulk 2
## 1 3 3
## 0.D13-A.bulk native 1.D07-G.bulk 1 1.D13-A.bulk 2
## 1 3 3
## 1.D13-A.bulk native 15.D07-G.bulk 1 15.D07-G.bulk APEX
## 1 4 2
## 15.D07-G.bulk native 15.D07-G.floating APEX 15.D13-A.bulk native
## 3 3 3
## 15.D13-A.floating APEX 15.D13-A.floating native 21.D07-G.bulk APEX
## 1 2 6
## 3.D07-G.bulk 1 3.D07-G.bulk 2 3.D13-A.bulk 2
## 2 1 3
## 3.D13-A.bulk native 9.D07-G.bulk 1 9.D07-G.bulk 2
## 1 2 1
## 9.D13-A.bulk 2
## 3
#remove APEX samples
series_ob = series_ob[,series_ob$samples$construct == "native"]
#Check column names are the same
series_ob$samples
## group lib.size norm.factors sample_id time
## 6.19190_S20 -2.D13-A.bulk 43343287 1 6.19190_S20 -2
## 7.19191_S1 0.D13-A.bulk 53954862 1 7.19191_S1 0
## 8.19192_S3 1.D13-A.bulk 44816024 1 8.19192_S3 1
## 9.19193_S6 3.D13-A.bulk 27650074 1 9.19193_S6 3
## 10.19194_S9 9.D13-A.bulk 33681648 1 10.19194_S9 9
## 11.19195_S12 -2.D13-A.bulk 44069380 1 11.19195_S12 -2
## 12.19196_S15 0.D13-A.bulk 33748947 1 12.19196_S15 0
## 13.19197_S18 1.D13-A.bulk 35268416 1 13.19197_S18 1
## 14.19198_S21 3.D13-A.bulk 38990845 1 14.19198_S21 3
## 15.19199_S2 9.D13-A.bulk 31648240 1 15.19199_S2 9
## 1.19200_S5 -2.D13-A.bulk 24715231 1 1.19200_S5 -2
## 2.19201_S8 0.D13-A.bulk 25887790 1 2.19201_S8 0
## 3.19202_S11 1.D13-A.bulk 67946021 1 3.19202_S11 1
## 4.19203_S14 3.D13-A.bulk 51335376 1 4.19203_S14 3
## 5.19204_S17 9.D13-A.bulk 41570892 1 5.19204_S17 9
## 20.S2.KD5NT_S16 -2.D07-G.bulk 28227567 1 20.S2.KD5NT_S16 -2
## 21.S7.KD6NT_S19 -2.D07-G.bulk 30639099 1 21.S7.KD6NT_S19 -2
## 22.S12.KD3NT_S22 -2.D07-G.bulk 39314312 1 22.S12.KD3NT_S22 -2
## 1.KD4NT_S1 -2.D07-G.bulk 40875335 1 1.KD4NT_S1 -2
## 10.P6D3_S10 3.D07-G.bulk 37487858 1 10.P6D3_S10 3
## 11.P6D9_S11 9.D07-G.bulk 45348216 1 11.P6D9_S11 9
## 12.KD3NT_S12 -2.D07-G.bulk 52250725 1 12.KD3NT_S12 -2
## 13.P8D0_S13 0.D07-G.bulk 50473193 1 13.P8D0_S13 0
## 14.P8D1_S14 1.D07-G.bulk 37355443 1 14.P8D1_S14 1
## 15.P8D3_S15 3.D07-G.bulk 36821008 1 15.P8D3_S15 3
## 16.P8D9_S16 9.D07-G.bulk 25192201 1 16.P8D9_S16 9
## 2.KD5NT_S2 -2.D07-G.bulk 39256967 1 2.KD5NT_S2 -2
## 3.P5D0_S3 0.D07-G.bulk 35885732 1 3.P5D0_S3 0
## 4.P5D1_S4 1.D07-G.bulk 44499333 1 4.P5D1_S4 1
## 5.P5D3_S5 3.D07-G.bulk 45252924 1 5.P5D3_S5 3
## 6.P5D9_S6 9.D07-G.bulk 38835735 1 6.P5D9_S6 9
## 7.KD6NT_S7 -2.D07-G.bulk 37829067 1 7.KD6NT_S7 -2
## 8.P6D0_S8 0.D07-G.bulk 38988386 1 8.P6D0_S8 0
## 9.P6D1_S9 1.D07-G.bulk 40833777 1 9.P6D1_S9 1
## 13.21417_S53 15.D07-G.floating 39520835 1 13.21417_S53 15
## 14.21418_S2 15.D07-G.floating 43198916 1 14.21418_S2 15
## 15.21419_S9 15.D07-G.floating 53192791 1 15.21419_S9 15
## 16.21420_S16 15.D13-A.floating 58757001 1 16.21420_S16 15
## 17.21421_S24 15.D13-A.floating 64220795 1 17.21421_S24 15
## 18.21422_S32 15.D13-A.floating 65300951 1 18.21422_S32 15
## 19.21423_S40 15.D07-G.bulk 43513597 1 19.21423_S40 15
## 20.21424_S47 15.D07-G.bulk 68529570 1 20.21424_S47 15
## 21.21425_S54 15.D07-G.bulk 67338531 1 21.21425_S54 15
## 22.21426_S3 15.D13-A.bulk 28524813 1 22.21426_S3 15
## 23.21427_S10 15.D13-A.bulk 51405957 1 23.21427_S10 15
## 24.21428_S17 15.D13-A.bulk 52000236 1 24.21428_S17 15
## donor biorep exp construct separation time.donor
## 6.19190_S20 D13-A 7 2 native bulk -2.D13-A
## 7.19191_S1 D13-A 7 2 native bulk 0.D13-A
## 8.19192_S3 D13-A 7 2 native bulk 1.D13-A
## 9.19193_S6 D13-A 7 2 native bulk 3.D13-A
## 10.19194_S9 D13-A 7 2 native bulk 9.D13-A
## 11.19195_S12 D13-A 8 2 native bulk -2.D13-A
## 12.19196_S15 D13-A 8 2 native bulk 0.D13-A
## 13.19197_S18 D13-A 8 2 native bulk 1.D13-A
## 14.19198_S21 D13-A 8 2 native bulk 3.D13-A
## 15.19199_S2 D13-A 8 2 native bulk 9.D13-A
## 1.19200_S5 D13-A 5 2 native bulk -2.D13-A
## 2.19201_S8 D13-A 5 2 native bulk 0.D13-A
## 3.19202_S11 D13-A 5 2 native bulk 1.D13-A
## 4.19203_S14 D13-A 5 2 native bulk 3.D13-A
## 5.19204_S17 D13-A 5 2 native bulk 9.D13-A
## 20.S2.KD5NT_S16 D07-G 5 2 native bulk -2.D07-G
## 21.S7.KD6NT_S19 D07-G 6 2 native bulk -2.D07-G
## 22.S12.KD3NT_S22 D07-G 8 2 native bulk -2.D07-G
## 1.KD4NT_S1 D07-G P4 1 native bulk -2.D07-G
## 10.P6D3_S10 D07-G P6 1 native bulk 3.D07-G
## 11.P6D9_S11 D07-G P6 1 native bulk 9.D07-G
## 12.KD3NT_S12 D07-G P8 1 native bulk -2.D07-G
## 13.P8D0_S13 D07-G P8 1 native bulk 0.D07-G
## 14.P8D1_S14 D07-G P8 1 native bulk 1.D07-G
## 15.P8D3_S15 D07-G P8 1 native bulk 3.D07-G
## 16.P8D9_S16 D07-G P8 1 native bulk 9.D07-G
## 2.KD5NT_S2 D07-G P5 1 native bulk -2.D07-G
## 3.P5D0_S3 D07-G P5 1 native bulk 0.D07-G
## 4.P5D1_S4 D07-G P5 1 native bulk 1.D07-G
## 5.P5D3_S5 D07-G P5 1 native bulk 3.D07-G
## 6.P5D9_S6 D07-G P5 1 native bulk 9.D07-G
## 7.KD6NT_S7 D07-G P6 1 native bulk -2.D07-G
## 8.P6D0_S8 D07-G P6 1 native bulk 0.D07-G
## 9.P6D1_S9 D07-G P6 1 native bulk 1.D07-G
## 13.21417_S53 D07-G P6 native native floating 15.D07-G
## 14.21418_S2 D07-G P8_1 native native floating 15.D07-G
## 15.21419_S9 D07-G P8_2 native native floating 15.D07-G
## 16.21420_S16 D13-A P6_1 native native floating 15.D13-A
## 17.21421_S24 D13-A P6_2 native native floating 15.D13-A
## 18.21422_S32 D13-A P7 native native floating 15.D13-A
## 19.21423_S40 D07-G P7 native native bulk 15.D07-G
## 20.21424_S47 D07-G P8_2 native native bulk 15.D07-G
## 21.21425_S54 D07-G P8_1 native native bulk 15.D07-G
## 22.21426_S3 D13-A P7 native native bulk 15.D13-A
## 23.21427_S10 D13-A P6 native native bulk 15.D13-A
## 24.21428_S17 D13-A P7 native native bulk 15.D13-A
colnames(series_ob)
## [1] "6.19190_S20" "7.19191_S1" "8.19192_S3" "9.19193_S6"
## [5] "10.19194_S9" "11.19195_S12" "12.19196_S15" "13.19197_S18"
## [9] "14.19198_S21" "15.19199_S2" "1.19200_S5" "2.19201_S8"
## [13] "3.19202_S11" "4.19203_S14" "5.19204_S17" "20.S2.KD5NT_S16"
## [17] "21.S7.KD6NT_S19" "22.S12.KD3NT_S22" "1.KD4NT_S1" "10.P6D3_S10"
## [21] "11.P6D9_S11" "12.KD3NT_S12" "13.P8D0_S13" "14.P8D1_S14"
## [25] "15.P8D3_S15" "16.P8D9_S16" "2.KD5NT_S2" "3.P5D0_S3"
## [29] "4.P5D1_S4" "5.P5D3_S5" "6.P5D9_S6" "7.KD6NT_S7"
## [33] "8.P6D0_S8" "9.P6D1_S9" "13.21417_S53" "14.21418_S2"
## [37] "15.21419_S9" "16.21420_S16" "17.21421_S24" "18.21422_S32"
## [41] "19.21423_S40" "20.21424_S47" "21.21425_S54" "22.21426_S3"
## [45] "23.21427_S10" "24.21428_S17"
Everything looks good. 1. MDSplot separates timepoints well 2. the RLE plot shows that median counts between libraries are 0, meaning library size and distribution is similar between libraries. Therefore normalisation between experiments is not necessary.
plotMDS(series_ob, labels = series_ob$samples$time, gene.selection = "common")
plotRLE(series_ob$counts, outline=FALSE, col=series_ob$samples$group, main="Before filtering")
## Filtering
plotSmear(series_ob, main = "Before Filtering")
dim(series_ob)
## [1] 58735 46
filt_series = series_ob[filterByExpr(series_ob),, keep.lib.sizes=FALSE]
dim(filt_series) #21174 genes kept (about 1k more genes that the timecourse to day9)
## [1] 21174 46
#The filter is approxiamtely 10/ libsize in millions
10/(median(series_ob$samples$lib.size)/1000000) #0.245 minimum CPM
## [1] 0.2447707
#sanity check that gene_names match up
filt_series$counts[filt_series$genes$Geneid == "ENSG00000228630",] #HOTAIR
## 6.19190_S20 7.19191_S1 8.19192_S3 9.19193_S6
## 67.00 60.00 61.00 26.00
## 10.19194_S9 11.19195_S12 12.19196_S15 13.19197_S18
## 18.00 39.00 52.00 36.00
## 14.19198_S21 15.19199_S2 1.19200_S5 2.19201_S8
## 41.00 15.00 18.00 46.00
## 3.19202_S11 4.19203_S14 5.19204_S17 20.S2.KD5NT_S16
## 105.00 37.00 12.00 200.00
## 21.S7.KD6NT_S19 22.S12.KD3NT_S22 1.KD4NT_S1 10.P6D3_S10
## 251.90 313.00 276.00 325.00
## 11.P6D9_S11 12.KD3NT_S12 13.P8D0_S13 14.P8D1_S14
## 96.00 427.00 770.25 462.00
## 15.P8D3_S15 16.P8D9_S16 2.KD5NT_S2 3.P5D0_S3
## 237.00 106.00 196.00 375.00
## 4.P5D1_S4 5.P5D3_S5 6.P5D9_S6 7.KD6NT_S7
## 454.00 438.00 151.00 342.00
## 8.P6D0_S8 9.P6D1_S9 13.21417_S53 14.21418_S2
## 595.00 330.00 71.00 41.00
## 15.21419_S9 16.21420_S16 17.21421_S24 18.21422_S32
## 71.00 26.00 11.00 15.00
## 19.21423_S40 20.21424_S47 21.21425_S54 22.21426_S3
## 339.00 797.12 844.00 26.00
## 23.21427_S10 24.21428_S17
## 78.00 47.00
filt_series$counts[filt_series$genes$Geneid == "ENSG00000132170",] #PPARG
## 6.19190_S20 7.19191_S1 8.19192_S3 9.19193_S6
## 1303.10 1667.00 3994.00 6675.10
## 10.19194_S9 11.19195_S12 12.19196_S15 13.19197_S18
## 7365.24 1319.10 557.12 3194.31
## 14.19198_S21 15.19199_S2 1.19200_S5 2.19201_S8
## 7950.67 9278.10 542.00 671.00
## 3.19202_S11 4.19203_S14 5.19204_S17 20.S2.KD5NT_S16
## 4561.33 8161.83 11953.80 392.20
## 21.S7.KD6NT_S19 22.S12.KD3NT_S22 1.KD4NT_S1 10.P6D3_S10
## 1183.37 1248.50 743.00 3307.83
## 11.P6D9_S11 12.KD3NT_S12 13.P8D0_S13 14.P8D1_S14
## 5122.53 976.17 645.00 1255.00
## 15.P8D3_S15 16.P8D9_S16 2.KD5NT_S2 3.P5D0_S3
## 3011.50 3613.41 537.00 262.00
## 4.P5D1_S4 5.P5D3_S5 6.P5D9_S6 7.KD6NT_S7
## 886.00 3387.60 6195.75 793.00
## 8.P6D0_S8 9.P6D1_S9 13.21417_S53 14.21418_S2
## 537.10 1261.10 20687.05 21560.08
## 15.21419_S9 16.21420_S16 17.21421_S24 18.21422_S32
## 19293.64 30703.87 32751.01 38751.89
## 19.21423_S40 20.21424_S47 21.21425_S54 22.21426_S3
## 10886.47 12895.56 13914.89 8218.48
## 23.21427_S10 24.21428_S17
## 16600.64 18475.29
plotSmear(filt_series, main = "After Filtering")
Donor vs donor this looks like very nice filtering (above, at any one time point some genes will be missing).
#Plot donor v donor -> a few low exression genes still but predominantly good filtering
two_filt = filt_series
two_filt$samples$group = as.factor(filt_series$samples$donor)
plotSmear(two_filt, main = "After Filtering")
#library sizes are different, but relative log expression is
#now ~ normally distributed within libraries
plotRLE(filt_series$counts, outline=FALSE,
col=filt_series$samples$group, main="After filtering")
plotMDS(filt_series, labels = filt_series$samples$group, gene.selection = "common")
filt_series = calcNormFactors(filt_series,method = "TMM")
filt_series$samples$norm.factors
## [1] 1.2453947 1.2143844 1.1251937 1.1279176 0.9458378 1.3221153 1.0006515
## [8] 1.1403659 0.9928395 1.0445354 1.3664816 1.1431881 1.2621641 1.0546869
## [15] 1.0419248 1.2097142 1.3116360 1.2920292 1.1244045 0.9951992 0.7803828
## [22] 1.1123288 0.9314547 1.0344064 0.8685081 0.8944916 1.1077081 0.8561931
## [29] 0.9462953 0.9419874 0.8237669 1.1048697 0.9583928 0.8541696 0.7900846
## [36] 0.6108555 0.6834306 0.8374520 0.7701180 0.8412588 0.9839876 1.0048101
## [43] 1.0123225 0.9974551 0.9286849 1.0430259
plotRLE(cpm(filt_series$counts), outline=FALSE,
col=filt_series$samples$group, main="After TMM normalisation (no libsize)")
Library size is approximately the same after CPM normalisation, but it still needs to be adjusted directly for library size:
plotRLE(cpm(filt_series, normalized.lib.sizes = TRUE), outline=FALSE,
col=filt_series$samples$group, main="After TMM normalisation (with libsize)")
library(biomaRt)
mart <- biomaRt::useMart(biomart = "ensembl",
dataset = "hsapiens_gene_ensembl",
host = "https://jan2019.archive.ensembl.org")
annot = getBM(c("external_gene_name", "description", "ensembl_gene_id"),
filters = "ensembl_gene_id",
values = filt_series$genes$Geneid,
mart = mart)
head(annot, n=2); dim(annot)
## external_gene_name description
## 1 TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2 TNMD tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
## ensembl_gene_id
## 1 ENSG00000000003
## 2 ENSG00000000005
## [1] 21174 3
#Tidying up the annot table
annot$description = gsub("\\[Source:.+\\]", "", annot$description)
colnames(annot)[1] = "gene_name"
#Add gene names to filt_series
filt_series$genes = merge(filt_series$genes, annot, by.x= "Geneid",
by.y = "ensembl_gene_id", sort=FALSE)
#head(filt_series$genes)
#make sure length is numeric
filt_series$genes$Length = as.numeric(filt_series$genes$Length)
norm_series = voom(filt_series, plot = TRUE)
head(norm_series$design, n=2) #groups are time.donor
## (Intercept) group-2.D13-A.bulk group0.D07-G.bulk group0.D13-A.bulk
## 6.19190_S20 1 1 0 0
## 7.19191_S1 1 0 0 1
## group1.D07-G.bulk group1.D13-A.bulk group15.D07-G.bulk
## 6.19190_S20 0 0 0
## 7.19191_S1 0 0 0
## group15.D07-G.floating group15.D13-A.bulk group15.D13-A.floating
## 6.19190_S20 0 0 0
## 7.19191_S1 0 0 0
## group3.D07-G.bulk group3.D13-A.bulk group9.D07-G.bulk
## 6.19190_S20 0 0 0
## 7.19191_S1 0 0 0
## group9.D13-A.bulk
## 6.19190_S20 0
## 7.19191_S1 0
summary(norm_series$E[,1])#log normlisation min -6, max 14
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.754 -1.468 1.972 1.594 4.635 14.871
How to separate floating and bulk adipocytes? Perhaps I’ll need to run the analysis separately, but start together with the separation as a factor.
#1 check the default design
colnames(norm_series$design)
## [1] "(Intercept)" "group-2.D13-A.bulk" "group0.D07-G.bulk"
## [4] "group0.D13-A.bulk" "group1.D07-G.bulk" "group1.D13-A.bulk"
## [7] "group15.D07-G.bulk" "group15.D07-G.floating" "group15.D13-A.bulk"
## [10] "group15.D13-A.floating" "group3.D07-G.bulk" "group3.D13-A.bulk"
## [13] "group9.D07-G.bulk" "group9.D13-A.bulk"
head(filt_series$samples, n=2)
## group lib.size norm.factors sample_id time donor biorep
## 6.19190_S20 -2.D13-A.bulk 43327502 1.245395 6.19190_S20 -2 D13-A 7
## 7.19191_S1 0.D13-A.bulk 53932903 1.214384 7.19191_S1 0 D13-A 7
## exp construct separation time.donor
## 6.19190_S20 2 native bulk -2.D13-A
## 7.19191_S1 2 native bulk 0.D13-A
pair_design = model.matrix(~0+filt_series$samples$group)
colnames(pair_design) = paste("day", gsub("-",".",levels(filt_series$samples$group)), sep="")
rownames(pair_design) = filt_series$samples$sample_id
head(pair_design, n=2)
## day.2.D07.G.bulk day.2.D13.A.bulk day0.D07.G.bulk day0.D13.A.bulk
## 6.19190_S20 0 1 0 0
## 7.19191_S1 0 0 0 1
## day1.D07.G.bulk day1.D13.A.bulk day15.D07.G.bulk
## 6.19190_S20 0 0 0
## 7.19191_S1 0 0 0
## day15.D07.G.floating day15.D13.A.bulk day15.D13.A.floating
## 6.19190_S20 0 0 0
## 7.19191_S1 0 0 0
## day3.D07.G.bulk day3.D13.A.bulk day9.D07.G.bulk day9.D13.A.bulk
## 6.19190_S20 0 0 0 0
## 7.19191_S1 0 0 0 0
Using bulk day15 samples time_effect_donor7 + time_effect_donor13 was the pvalue used at 0.01 to select genes for dpgp
pair_fit = lmFit(norm_series, design=pair_design)
contrasts = makeContrasts(time_effect_donor7 = (day0.D07.G.bulk - day.2.D07.G.bulk ) + #D-2 -> D0 effect
(day1.D07.G.bulk - day0.D07.G.bulk) + #D0 -> D1 effect
(day3.D07.G.bulk - day1.D07.G.bulk) + #D1 -> D3 effect
(day9.D07.G.bulk - day3.D07.G.bulk) + #D3 -> D9 effect
(day15.D07.G.bulk - day9.D07.G.bulk),
time_effect_donor13 = (day0.D13.A.bulk - day.2.D13.A.bulk ) + #D-2 -> D0 effect
(day1.D13.A.bulk - day0.D13.A.bulk) + #D0 -> D1 effect
(day3.D13.A.bulk - day1.D13.A.bulk) + #D1 -> D3 effect
(day9.D13.A.bulk - day3.D13.A.bulk) + #D3 -> D9 effect
(day15.D13.A.bulk - day9.D13.A.bulk), #D9 -> D15 effect
time_effect_per_donor = ((day0.D07.G.bulk - day.2.D07.G.bulk ) + #donor 7
(day1.D07.G.bulk - day0.D07.G.bulk) +
(day3.D07.G.bulk - day1.D07.G.bulk) +
(day9.D07.G.bulk - day3.D07.G.bulk) +
(day15.D07.G.bulk - day9.D07.G.bulk)) - #minus
((day0.D13.A.bulk - day.2.D13.A.bulk ) + #donor 13
(day1.D13.A.bulk - day0.D13.A.bulk) +
(day3.D13.A.bulk - day1.D13.A.bulk) +
(day9.D13.A.bulk - day3.D13.A.bulk) +
(day15.D13.A.bulk - day9.D13.A.bulk)),
levels=pair_design)
special_fit = contrasts.fit(pair_fit,contrasts)
special_fit = eBayes(special_fit, robust = TRUE)
summary(decideTests(special_fit, method="separate", adjust.method = "BH", p.value = 0.01)) #Adjusted for contrast
## time_effect_donor7 time_effect_donor13 time_effect_per_donor
## Down 4563 4558 67
## NotSig 11929 12246 20976
## Up 4682 4370 131
summary(decideTests(special_fit ,method="global", adjust.method = "BH")) #Unadjusted for contrast
## time_effect_donor7 time_effect_donor13 time_effect_per_donor
## Down 5336 5534 1399
## NotSig 10166 10173 18229
## Up 5672 5467 1546
Interesting, the time effect is less significant in donor13. The time effect is similar rather than different between donors -> most genes respons similarly to adipogenic cocktail, regardless of depot/donor.
Adding an extra timepoint adds more genes to the DE list; 11,639 genes at p < 0.01 (basically 50% of all detected genes).
comb = topTable(special_fit, number = nrow(pair_fit$genes), coef = c("time_effect_donor7","time_effect_donor13"))
dim(comb[comb$adj.P.Val < 0.05,]) #More than half of all genes change; most will be the same
## [1] 14236 10
dim(comb[comb$adj.P.Val < 0.01,])
## [1] 11639 10
volcanoplot(special_fit, coef = c(1,2), highlight =10, names = special_fit$genes$gene_name,
main="Time effect (combined across donors)", xlab="log2FC ABDO <---> GLUT")
volcanoplot(special_fit, coef = c(1,2), highlight =11639, names = special_fit$genes$gene_name,
main="Time effect (combined across donors)", xlab="log2FC ABDO <---> GLUT")
volcanoplot(special_fit, coef = c(1,2),
main="Time effect (combined across donors)", xlab="log2FC ABDO <---> GLUT")
head(comb)
## Geneid Length gene_name
## ENSG00000151726 ENSG00000151726 6284 ACSL1
## ENSG00000099194 ENSG00000099194 5362 SCD
## ENSG00000091513 ENSG00000091513 26199 TF
## ENSG00000056998 ENSG00000056998 3655 GYG2
## ENSG00000042445 ENSG00000042445 4005 RETSAT
## ENSG00000131471 ENSG00000131471 4524 AOC3
## description
## ENSG00000151726 acyl-CoA synthetase long chain family member 1
## ENSG00000099194 stearoyl-CoA desaturase
## ENSG00000091513 transferrin
## ENSG00000056998 glycogenin 2
## ENSG00000042445 retinol saturase
## ENSG00000131471 amine oxidase, copper containing 3
## time_effect_donor7 time_effect_donor13 AveExpr F
## ENSG00000151726 6.328527 6.587910 8.810891 1311.9493
## ENSG00000099194 8.374125 8.116002 11.807874 977.5091
## ENSG00000091513 15.353000 14.232266 3.105257 772.2546
## ENSG00000056998 5.435924 4.592658 4.980263 682.1097
## ENSG00000042445 3.744336 4.451633 7.100134 664.3586
## ENSG00000131471 11.794312 11.680727 6.379407 583.7300
## P.Value adj.P.Val
## ENSG00000151726 4.270636e-36 9.042645e-32
## ENSG00000099194 1.366785e-33 1.447015e-29
## ENSG00000091513 1.130334e-31 7.977894e-28
## ENSG00000056998 9.522686e-31 5.040834e-27
## ENSG00000042445 1.558533e-30 6.600076e-27
## ENSG00000131471 2.068864e-29 6.793301e-26
write.table(comb, sep="\t", row.names=FALSE,
file.path(ous_dir, "Feb22_RNAseq/03limma/time_effect_perdonor.tab"))
Still, at p < 0.05 about 8k genes hit sig. in both donors separately, and 3k each only hit sig. in one donor. For the HOTAIR project we didn’t want to select consensus DE genes, because its only DE in Glut. For the nucleolus project it might have made sense to, but to match what’s gone before I don’t want to make it too complicated.
donor7 = topTable(special_fit, number = nrow(pair_fit$genes), coef = c("time_effect_donor7"), adjust.method = "BH")
donor13 = topTable(special_fit, number = nrow(pair_fit$genes), coef = c("time_effect_donor13"))
donor7 = donor7[donor7$adj.P.Val < 0.05,]
donor13 = donor13[donor13$adj.P.Val < 0.05,]
dim(donor7);dim(donor13)
## [1] 11472 10
## [1] 11512 10
both_donors = merge(donor7, donor13, all=FALSE, by="Geneid"); dim(both_donors)
## [1] 8152 19
just13 = donor13[!(donor13$Geneid %in% donor7$Geneid),]
dim(just13)
## [1] 3360 10
just7 = donor7[!(donor7$Geneid %in% donor13$Geneid),]
dim(just7)
## [1] 3320 10
Other plots to look at the difference
library(ggplot2)
ggplot(comb, aes(y=AveExpr, x=-log10(P.Value))) + geom_point(alpha=0.1) +
geom_point(data=comb[1:5000,], colour="red", alpha=0.1)
logFC time pairs for each donor separately (so 8 total columns) Also adding a bulk to floating test, since floating cells are theoretically the most mature subset of bulk cells.
pair_cont = makeContrasts(#Donor 7 changes over time
d.2_to_d0.D7 = day0.D07.G.bulk - day.2.D07.G.bulk,
d0_to_d1.D7 = day1.D07.G.bulk - day0.D07.G.bulk,
d1_to_d3.D7 = day3.D07.G.bulk - day1.D07.G.bulk,
d3_to_d9.D7= day9.D07.G.bulk - day3.D07.G.bulk,
d9_to_d15.D7.bulk = day15.D07.G.bulk - day9.D07.G.bulk,
d9_to_d15.D7.floating = day15.D07.G.floating - day9.D07.G.bulk,
d15.bulk_to_floating.D7 = day15.D07.G.floating - day15.D07.G.bulk,
#Donor 13 changes over time
d.2_to_d0.D13 = day0.D13.A.bulk - day.2.D13.A.bulk,
d0_to_d1.D13 = day1.D13.A.bulk - day0.D13.A.bulk,
d1_to_d3.D13 = day3.D13.A.bulk - day1.D13.A.bulk,
d3_to_d9.D13 = day9.D13.A.bulk - day3.D13.A.bulk,
d9_to_d15.D13.bulk = day15.D13.A.bulk - day9.D13.A.bulk,
d9_to_d15.D13.floating = day15.D13.A.floating - day9.D13.A.bulk,
d15.bulk_to_floating.D13 = day15.D13.A.floating - day15.D13.A.bulk,
levels=pair_design)
pair_fit = contrasts.fit(pair_fit,pair_cont)
pair_fit = eBayes(pair_fit, robust = TRUE)
pair_tab = topTable(pair_fit, number = nrow(pair_fit$genes))
nrow(pair_tab[pair_tab$adj.P.Val < 0.01,])
## [1] 18297
Donor 13 has very similar bulk and floating; D7 bulk and floating are very different :P
result = decideTests(pair_fit, p.value = 0.01)
summary(result)
## d.2_to_d0.D7 d0_to_d1.D7 d1_to_d3.D7 d3_to_d9.D7 d9_to_d15.D7.bulk
## Down 2489 1437 250 319 3353
## NotSig 15507 18900 20481 20466 14120
## Up 3178 837 443 389 3701
## d9_to_d15.D7.floating d15.bulk_to_floating.D7 d.2_to_d0.D13 d0_to_d1.D13
## Down 4357 3097 909 1301
## NotSig 12264 14860 19492 19021
## Up 4553 3217 773 852
## d1_to_d3.D13 d3_to_d9.D13 d9_to_d15.D13.bulk d9_to_d15.D13.floating
## Down 619 178 3067 3181
## NotSig 19845 20687 15334 14969
## Up 710 309 2773 3024
## d15.bulk_to_floating.D13
## Down 131
## NotSig 20871
## Up 172
info = data.frame(t(summary(result)))
colnames(info) = c("test", "result", "freq")
info$donor = gsub("d.*_to_(d.*|floating)\\.", "", gsub(".(bulk|floating)$", "", info$test))
info$timezone = factor(gsub("\\.D(7|13)", "", info$test),
levels = c("d.2_to_d0","d0_to_d1", "d1_to_d3" ,"d3_to_d9" ,"d9_to_d15.bulk", "d9_to_d15.floating","d15.bulk_to_floating"))
info$result = factor(info$result, levels=c("Up","Down", "NotSig"))
info
## test result freq donor timezone
## 1 d.2_to_d0.D7 Down 2489 D7 d.2_to_d0
## 2 d0_to_d1.D7 Down 1437 D7 d0_to_d1
## 3 d1_to_d3.D7 Down 250 D7 d1_to_d3
## 4 d3_to_d9.D7 Down 319 D7 d3_to_d9
## 5 d9_to_d15.D7.bulk Down 3353 D7 d9_to_d15.bulk
## 6 d9_to_d15.D7.floating Down 4357 D7 d9_to_d15.floating
## 7 d15.bulk_to_floating.D7 Down 3097 D7 d15.bulk_to_floating
## 8 d.2_to_d0.D13 Down 909 D13 d.2_to_d0
## 9 d0_to_d1.D13 Down 1301 D13 d0_to_d1
## 10 d1_to_d3.D13 Down 619 D13 d1_to_d3
## 11 d3_to_d9.D13 Down 178 D13 d3_to_d9
## 12 d9_to_d15.D13.bulk Down 3067 D13 d9_to_d15.bulk
## 13 d9_to_d15.D13.floating Down 3181 D13 d9_to_d15.floating
## 14 d15.bulk_to_floating.D13 Down 131 D13 d15.bulk_to_floating
## 15 d.2_to_d0.D7 NotSig 15507 D7 d.2_to_d0
## 16 d0_to_d1.D7 NotSig 18900 D7 d0_to_d1
## 17 d1_to_d3.D7 NotSig 20481 D7 d1_to_d3
## 18 d3_to_d9.D7 NotSig 20466 D7 d3_to_d9
## 19 d9_to_d15.D7.bulk NotSig 14120 D7 d9_to_d15.bulk
## 20 d9_to_d15.D7.floating NotSig 12264 D7 d9_to_d15.floating
## 21 d15.bulk_to_floating.D7 NotSig 14860 D7 d15.bulk_to_floating
## 22 d.2_to_d0.D13 NotSig 19492 D13 d.2_to_d0
## 23 d0_to_d1.D13 NotSig 19021 D13 d0_to_d1
## 24 d1_to_d3.D13 NotSig 19845 D13 d1_to_d3
## 25 d3_to_d9.D13 NotSig 20687 D13 d3_to_d9
## 26 d9_to_d15.D13.bulk NotSig 15334 D13 d9_to_d15.bulk
## 27 d9_to_d15.D13.floating NotSig 14969 D13 d9_to_d15.floating
## 28 d15.bulk_to_floating.D13 NotSig 20871 D13 d15.bulk_to_floating
## 29 d.2_to_d0.D7 Up 3178 D7 d.2_to_d0
## 30 d0_to_d1.D7 Up 837 D7 d0_to_d1
## 31 d1_to_d3.D7 Up 443 D7 d1_to_d3
## 32 d3_to_d9.D7 Up 389 D7 d3_to_d9
## 33 d9_to_d15.D7.bulk Up 3701 D7 d9_to_d15.bulk
## 34 d9_to_d15.D7.floating Up 4553 D7 d9_to_d15.floating
## 35 d15.bulk_to_floating.D7 Up 3217 D7 d15.bulk_to_floating
## 36 d.2_to_d0.D13 Up 773 D13 d.2_to_d0
## 37 d0_to_d1.D13 Up 852 D13 d0_to_d1
## 38 d1_to_d3.D13 Up 710 D13 d1_to_d3
## 39 d3_to_d9.D13 Up 309 D13 d3_to_d9
## 40 d9_to_d15.D13.bulk Up 2773 D13 d9_to_d15.bulk
## 41 d9_to_d15.D13.floating Up 3024 D13 d9_to_d15.floating
## 42 d15.bulk_to_floating.D13 Up 172 D13 d15.bulk_to_floating
ggplot(info) + geom_bar(stat="identity",aes(x=timezone,y=freq, fill=donor, group=donor), position="dodge") +
theme(axis.text.x = element_text(angle=45, hjust=1)) + facet_grid(result~., scales = "free_y")
ggplot(info[info$result != "NotSig",]) + geom_bar(stat="identity",aes(x=timezone,y=freq, fill=donor, group=donor), position="dodge") +
theme(axis.text.x = element_text(angle=45, hjust=1)) + facet_grid(result~.)
ggplot(info[info$result != "NotSig",]) + geom_bar(stat="summary", fun="sum",aes(x=timezone,y=freq, fill=donor, group=donor), position="dodge") +
theme(axis.text.x = element_text(angle=45, hjust=1)) + ylab("number sig genes")
Donor7 has more dynamic gene expression than donor13, particularly in cells becoming confluent and between bulk and floating adipocyte fractions. This means that when I do combined tests and analysis the effect is often driven by donor7 rather than donor13. It’s also notable that there is a huge difference in gene expression from day9 to day15, which will dominate over the more moderate effects seen in the early adipogenesis (from day0 to day9). It may make sense to streamline our timepoints or comparisons, to test more specific things. For example Pro -> D0 -> D9 -> D15
vennDiagram(result[,c("d.2_to_d0.D7", "d9_to_d15.D7.bulk", "d9_to_d15.D7.floating")], include="up")
vennDiagram(result[,c("d.2_to_d0.D7", "d9_to_d15.D7.bulk", "d9_to_d15.D7.floating")], include="down")
vennDiagram(result[,c("d9_to_d15.D7.bulk", "d9_to_d15.D7.floating", "d9_to_d15.D13.bulk","d9_to_d15.D13.floating")])
vennDiagram(result[,c( "d.2_to_d0.D7","d9_to_d15.D7.bulk", "d9_to_d15.D13.bulk")]) #day-2 to day0 changing genes often also change d9 to d15
#write.table(pair_tab, sep="\t", row.names=FALSE,
# file.path(limma_dir, "logFCs_timePairs.tab"))
Get an overview of these changes in a table form….
No genes are significant at every condition (time * donor). Few genes are significant at no condition (just 4650). Many genes are significant in 1 to 4 conditions, and fewer in 5 and more conditions.
dynamic = as.data.frame(result)
dynamic$sig_n = rowSums(result != 0)
dynamic$up_n = rowSums(result == 1)
dynamic$down_n = rowSums(result == -1)
head(dynamic)
## d.2_to_d0.D7 d0_to_d1.D7 d1_to_d3.D7 d3_to_d9.D7
## ENSG00000000003 0 0 0 0
## ENSG00000000005 0 0 0 0
## ENSG00000000419 0 0 0 0
## ENSG00000000457 0 0 0 0
## ENSG00000000460 -1 0 0 0
## ENSG00000000938 1 0 0 0
## d9_to_d15.D7.bulk d9_to_d15.D7.floating d15.bulk_to_floating.D7
## ENSG00000000003 1 1 0
## ENSG00000000005 0 0 0
## ENSG00000000419 0 0 0
## ENSG00000000457 0 0 0
## ENSG00000000460 0 0 0
## ENSG00000000938 0 1 1
## d.2_to_d0.D13 d0_to_d1.D13 d1_to_d3.D13 d3_to_d9.D13
## ENSG00000000003 0 0 0 0
## ENSG00000000005 0 0 0 1
## ENSG00000000419 0 0 0 0
## ENSG00000000457 0 0 0 0
## ENSG00000000460 -1 0 0 0
## ENSG00000000938 0 0 0 0
## d9_to_d15.D13.bulk d9_to_d15.D13.floating
## ENSG00000000003 0 0
## ENSG00000000005 0 0
## ENSG00000000419 0 0
## ENSG00000000457 0 0
## ENSG00000000460 0 0
## ENSG00000000938 1 1
## d15.bulk_to_floating.D13 sig_n up_n down_n
## ENSG00000000003 0 2 2 0
## ENSG00000000005 0 1 1 0
## ENSG00000000419 0 0 0 0
## ENSG00000000457 0 0 0 0
## ENSG00000000460 0 2 0 2
## ENSG00000000938 0 5 5 0
table(dynamic$sig_n)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 4650 3939 3965 2624 3043 1430 746 393 218 104 39 18 4 1
table(dynamic$up_n)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 10140 4466 2767 1685 1239 532 227 74 26 11 5 1 1
table(dynamic$down_n)
##
## 0 1 2 3 4 5 6 7 9
## 10294 4036 2912 1449 2044 346 78 14 1
plot(table(dynamic$sig_n))
ccount = function(x){return(-count(x))}
ggplot(dynamic) + geom_histogram(aes(x=up_n), fill="blue", binwidth = 1, colour="grey") +
geom_histogram( aes(x=down_n, y=-(..count..)), fill="red", binwidth = 1, colour="grey")
#write.table(result, file.path(ous_dir,"Feb22_RNAseq/03limma/decideResults_pairwise_summary.tab" ), sep="\t")
binary = result
binary[binary == -1] = 1
write.table(binary, file.path(ous_dir,"Feb22_RNAseq/03limma/decideResults_pairwise_binary.txt" ), sep="\t")
top = ggplot(dynamic) + geom_histogram(aes(x=up_n), fill="blue", binwidth = 1, colour="grey")
b= ggplot(dynamic) + geom_histogram( aes(x=down_n, y=-(..count..)), fill="red", binwidth = 1, colour="grey")
cowplot::plot_grid(top,b, ncol=1,nrow=2)
Step 1: make rpkms easier to do from the total than re-merge rpkms from previous analyses
rpkm = as.data.frame(rpkm(filt_series, normalized.lib.sizes = T))
rpkm$gene = rownames(rpkm)
head(rpkm)
## 6.19190_S20 7.19191_S1 8.19192_S3 9.19193_S6 10.19194_S9
## ENSG00000000003 4.429771 8.156323470 4.38952419 10.6804534 6.3980668
## ENSG00000000005 0.000000 0.009483402 0.04928463 0.0597703 1.3456444
## ENSG00000000419 44.399215 35.471356378 41.58979026 43.5042255 52.6514494
## ENSG00000000457 1.238538 2.004196996 1.73642745 2.1702937 1.8297088
## ENSG00000000460 1.180203 0.616667448 0.79787079 0.6343311 0.5946063
## ENSG00000000938 0.037342 0.039555121 0.06281165 0.1200338 0.2892193
## 11.19195_S12 12.19196_S15 13.19197_S18 14.19198_S21 15.19199_S2
## ENSG00000000003 6.801005854 5.34283930 8.97825018 6.20967066 10.8402147
## ENSG00000000005 0.000000000 0.00000000 0.01544879 0.03209898 2.4098978
## ENSG00000000419 45.991978584 39.33891270 49.48751378 50.78492239 42.9456144
## ENSG00000000457 1.295959535 1.57592908 1.85270325 1.76943419 2.0765218
## ENSG00000000460 2.466087279 0.52619404 0.65859945 0.48500820 0.6567233
## ENSG00000000938 0.004942582 0.02557923 0.03579814 0.20826453 0.2264708
## 1.19200_S5 2.19201_S8 3.19202_S11 4.19203_S14 5.19204_S17
## ENSG00000000003 9.06770306 5.32941019 5.84949758 6.0114837 9.2254928
## ENSG00000000005 0.00000000 0.08398167 0.60139195 0.2409799 4.4756616
## ENSG00000000419 33.13432809 39.93582431 41.36207525 44.8663886 41.5605299
## ENSG00000000457 1.48149383 1.92021291 1.75979885 1.9199313 2.2089933
## ENSG00000000460 1.30065327 0.54949797 0.75463451 0.4644325 0.6545097
## ENSG00000000938 0.01705362 0.01946035 0.03693757 0.1435891 0.6182750
## 20.S2.KD5NT_S16 21.S7.KD6NT_S19 22.S12.KD3NT_S22 1.KD4NT_S1
## ENSG00000000003 2.91981053 7.24007579 6.04191622 2.721130
## ENSG00000000005 0.01819565 0.03092287 0.00000000 0.000000
## ENSG00000000419 53.71154817 39.44174826 45.42236856 97.377053
## ENSG00000000457 1.02998553 1.53342908 1.72161788 1.638974
## ENSG00000000460 3.09922167 2.03999184 1.65032753 1.159884
## ENSG00000000938 0.00000000 0.00000000 0.00566926 0.000000
## 10.P6D3_S10 11.P6D9_S11 12.KD3NT_S12 13.P8D0_S13 14.P8D1_S14
## ENSG00000000003 3.04182092 2.3619870 2.007633 2.37629595 2.47737266
## ENSG00000000005 0.03330655 0.3159819 0.000000 0.00000000 0.00000000
## ENSG00000000419 55.34687707 61.8625718 59.451797 42.01958949 48.28312360
## ENSG00000000457 2.02064095 1.9257976 1.452945 1.24041816 1.42477241
## ENSG00000000460 0.46371294 0.3505026 1.110480 0.27099166 0.32147046
## ENSG00000000938 0.10033191 0.3091501 0.000000 0.08574259 0.01490318
## 15.P8D3_S15 16.P8D9_S16 2.KD5NT_S2 3.P5D0_S3 4.P5D1_S4
## ENSG00000000003 2.40701054 1.4325053 2.17149640 2.05304718 2.05323845
## ENSG00000000005 0.01942692 0.0000000 0.00000000 0.00000000 0.07376914
## ENSG00000000419 55.26450018 30.4398764 68.43831575 54.29605373 62.97810749
## ENSG00000000457 1.99078950 1.9511371 1.14632438 1.56093866 1.77664156
## ENSG00000000460 0.25160256 0.2157656 2.96071334 0.46373842 0.40604554
## ENSG00000000938 0.33312081 0.3322643 0.01986473 0.00937088 0.03418777
## 5.P5D3_S5 6.P5D9_S6 7.KD6NT_S7 8.P6D0_S8 9.P6D1_S9
## ENSG00000000003 2.85095385 3.1988551 2.11611139 3.65396967 2.11199568
## ENSG00000000005 0.11659502 0.5437334 0.01486434 0.00000000 0.03562279
## ENSG00000000419 57.48804753 57.4549878 71.05122695 43.42671695 75.29026376
## ENSG00000000457 1.93550926 2.1379258 1.45307098 1.94983340 1.81719641
## ENSG00000000460 0.46402550 0.5134805 1.34958572 0.38134185 0.34121346
## ENSG00000000938 0.08105267 0.3779840 0.01377754 0.09247046 0.01650912
## 13.21417_S53 14.21418_S2 15.21419_S9 16.21420_S16 17.21421_S24
## ENSG00000000003 14.2865244 19.9670327 12.9109419 13.8343917 16.5892979
## ENSG00000000005 0.6167761 0.7768957 0.3267391 3.0427612 2.8011821
## ENSG00000000419 36.1991881 28.8368126 25.0081755 46.1827349 43.6749895
## ENSG00000000457 1.9090162 1.3601721 1.4215021 1.9288800 2.0130346
## ENSG00000000460 0.8320844 0.9401136 0.6962417 0.6710995 0.6981899
## ENSG00000000938 3.1995683 4.6260542 3.4460243 3.6745701 4.3981194
## 18.21422_S32 19.21423_S40 20.21424_S47 21.21425_S54 22.21426_S3
## ENSG00000000003 17.1528743 9.0381035 8.8393788 9.5489457 7.6205880
## ENSG00000000005 1.9451814 0.9721878 0.6045077 0.2642974 1.8124435
## ENSG00000000419 41.6803305 38.4294219 35.1662323 33.6787557 37.5163734
## ENSG00000000457 1.9813518 1.9312377 1.5891691 1.6755817 1.7366544
## ENSG00000000460 0.7506485 0.7282125 0.6000864 0.4106589 0.4360015
## ENSG00000000938 1.8029603 0.5715975 0.6481190 0.6652296 1.5888471
## 23.21427_S10 24.21428_S17 gene
## ENSG00000000003 8.960145 8.4071871 ENSG00000000003
## ENSG00000000005 2.368496 1.6611412 ENSG00000000005
## ENSG00000000419 36.592366 34.4896434 ENSG00000000419
## ENSG00000000457 1.826420 1.3572695 ENSG00000000457
## ENSG00000000460 0.463495 0.4265672 ENSG00000000460
## ENSG00000000938 2.374268 1.7839139 ENSG00000000938
Step2 make long for plotting
21k * 2 donors * 7 conditions * 3 reps (+4 extra reps)
long = tidyr::pivot_longer(rpkm, 1:(ncol(rpkm)-1), names_to="sample_id", values_to = "rpkm")
head(long)
## # A tibble: 6 x 3
## gene sample_id rpkm
## <chr> <chr> <dbl>
## 1 ENSG00000000003 6.19190_S20 4.43
## 2 ENSG00000000003 7.19191_S1 8.16
## 3 ENSG00000000003 8.19192_S3 4.39
## 4 ENSG00000000003 9.19193_S6 10.7
## 5 ENSG00000000003 10.19194_S9 6.40
## 6 ENSG00000000003 11.19195_S12 6.80
#add sample info
long = merge(long, filt_series$samples)
head(long)
## sample_id gene rpkm group lib.size norm.factors
## 1 1.19200_S5 ENSG00000167476 0.04557251 -2.D13-A.bulk 24704720 1.366482
## 2 1.19200_S5 ENSG00000209082 22.51281884 -2.D13-A.bulk 24704720 1.366482
## 3 1.19200_S5 ENSG00000136051 14.23897850 -2.D13-A.bulk 24704720 1.366482
## 4 1.19200_S5 ENSG00000203808 0.11244166 -2.D13-A.bulk 24704720 1.366482
## 5 1.19200_S5 ENSG00000203780 0.83159053 -2.D13-A.bulk 24704720 1.366482
## 6 1.19200_S5 ENSG00000221930 3.02058173 -2.D13-A.bulk 24704720 1.366482
## time donor biorep exp construct separation time.donor
## 1 -2 D13-A 5 2 native bulk -2.D13-A
## 2 -2 D13-A 5 2 native bulk -2.D13-A
## 3 -2 D13-A 5 2 native bulk -2.D13-A
## 4 -2 D13-A 5 2 native bulk -2.D13-A
## 5 -2 D13-A 5 2 native bulk -2.D13-A
## 6 -2 D13-A 5 2 native bulk -2.D13-A
#add gene info
long = merge(long, filt_series$genes, by.x="gene", by.y="Geneid")
head(long)
## gene sample_id rpkm group lib.size norm.factors
## 1 ENSG00000000003 8.19192_S3 4.389524 1.D13-A.bulk 44801786 1.1251937
## 2 ENSG00000000003 24.21428_S17 8.407187 15.D13-A.bulk 51980503 1.0430259
## 3 ENSG00000000003 15.P8D3_S15 2.407011 3.D07-G.bulk 36812579 0.8685081
## 4 ENSG00000000003 12.KD3NT_S12 2.007633 -2.D07-G.bulk 52234909 1.1123288
## 5 ENSG00000000003 3.P5D0_S3 2.053047 0.D07-G.bulk 35877163 0.8561931
## 6 ENSG00000000003 20.21424_S47 8.839379 15.D07-G.bulk 68511442 1.0048101
## time donor biorep exp construct separation time.donor Length gene_name
## 1 1 D13-A 7 2 native bulk 1.D13-A 4535 TSPAN6
## 2 15 D13-A P7 native native bulk 15.D13-A 4535 TSPAN6
## 3 3 D07-G P8 1 native bulk 3.D07-G 4535 TSPAN6
## 4 -2 D07-G P8 1 native bulk -2.D07-G 4535 TSPAN6
## 5 0 D07-G P5 1 native bulk 0.D07-G 4535 TSPAN6
## 6 15 D07-G P8_2 native native bulk 15.D07-G 4535 TSPAN6
## description
## 1 tetraspanin 6
## 2 tetraspanin 6
## 3 tetraspanin 6
## 4 tetraspanin 6
## 5 tetraspanin 6
## 6 tetraspanin 6
Average per condition
21k * 2 donors * 7 conditions = ~294k
averages = group_by(long, group, gene, time, donor, gene_name, separation, description) %>% summarise(mean=mean(rpkm))
## `summarise()` has grouped output by 'group', 'gene', 'time', 'donor', 'gene_name', 'separation'. You can override using the `.groups` argument.
head(averages); dim(averages)
## # A tibble: 6 x 8
## # Groups: group, gene, time, donor, gene_name, separation [6]
## group gene time donor gene_name separation description mean
## <fct> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 -2.D07-G.bulk ENSG0~ -2 D07-G TSPAN6 bulk "tetraspanin 6 " 3.60e+0
## 2 -2.D07-G.bulk ENSG0~ -2 D07-G TNMD bulk "tenomodulin " 9.14e-3
## 3 -2.D07-G.bulk ENSG0~ -2 D07-G DPM1 bulk "dolichyl-phosp~ 6.21e+1
## 4 -2.D07-G.bulk ENSG0~ -2 D07-G SCYL3 bulk "SCY1 like pseu~ 1.43e+0
## 5 -2.D07-G.bulk ENSG0~ -2 D07-G C1orf112 bulk "chromosome 1 o~ 1.91e+0
## 6 -2.D07-G.bulk ENSG0~ -2 D07-G FGR bulk "FGR proto-onco~ 5.62e-3
## [1] 296436 8
#NB seems that scale in Rv4 returns a matrix, rather than a vector :P
averages = group_by(averages, gene) %>% mutate(z_score=scale(mean)[,1])
summary(averages[c("mean", "z_score")])
## mean z_score
## Min. : 0.00 Min. :-3.1098
## 1st Qu.: 0.19 1st Qu.:-0.6763
## Median : 1.23 Median :-0.2368
## Mean : 17.84 Mean : 0.0000
## 3rd Qu.: 5.25 3rd Qu.: 0.5459
## Max. :44097.56 Max. : 3.4738
head(averages); dim(averages)
## # A tibble: 6 x 9
## # Groups: gene [6]
## group gene time donor gene_name separation description mean z_score
## <fct> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 -2.D0~ ENSG0~ -2 D07-G TSPAN6 bulk "tetraspanin 6~ 3.60e+0 -0.771
## 2 -2.D0~ ENSG0~ -2 D07-G TNMD bulk "tenomodulin " 9.14e-3 -0.654
## 3 -2.D0~ ENSG0~ -2 D07-G DPM1 bulk "dolichyl-phos~ 6.21e+1 1.73
## 4 -2.D0~ ENSG0~ -2 D07-G SCYL3 bulk "SCY1 like pse~ 1.43e+0 -1.45
## 5 -2.D0~ ENSG0~ -2 D07-G C1orf112 bulk "chromosome 1 ~ 1.91e+0 2.50
## 6 -2.D0~ ENSG0~ -2 D07-G FGR bulk "FGR proto-onc~ 5.62e-3 -0.604
## [1] 296436 9
Save average rpkms in wide format
averages$sample = gsub("-",".",paste("day", averages$time, averages$donor, averages$separation, sep="_"))
unique(averages$sample)
## [1] "day_.2_D07.G_bulk" "day_.2_D13.A_bulk" "day_0_D07.G_bulk"
## [4] "day_0_D13.A_bulk" "day_1_D07.G_bulk" "day_1_D13.A_bulk"
## [7] "day_15_D07.G_bulk" "day_15_D07.G_floating" "day_15_D13.A_bulk"
## [10] "day_15_D13.A_floating" "day_3_D07.G_bulk" "day_3_D13.A_bulk"
## [13] "day_9_D07.G_bulk" "day_9_D13.A_bulk"
wide = pivot_wider(averages[c("gene_name", "sample", "mean")], names_from ="sample", values_from = "mean",
values_fn=mean) #using gene names has a few overlaps, if so just take the mean
head(wide); nrow(wide)
## # A tibble: 6 x 15
## gene_name day_.2_D07.G_bulk day_.2_D13.A_bu~ day_0_D07.G_bulk day_0_D13.A_bulk
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 TSPAN6 3.60 6.77 2.69 6.28
## 2 TNMD 0.00914 0 0 0.0312
## 3 DPM1 62.1 41.2 46.6 38.2
## 4 SCYL3 1.43 1.34 1.58 1.83
## 5 C1orf112 1.91 1.65 0.372 0.564
## 6 FGR 0.00562 0.0198 0.0625 0.0282
## # ... with 10 more variables: day_1_D07.G_bulk <dbl>, day_1_D13.A_bulk <dbl>,
## # day_15_D07.G_bulk <dbl>, day_15_D07.G_floating <dbl>,
## # day_15_D13.A_bulk <dbl>, day_15_D13.A_floating <dbl>,
## # day_3_D07.G_bulk <dbl>, day_3_D13.A_bulk <dbl>, day_9_D07.G_bulk <dbl>,
## # day_9_D13.A_bulk <dbl>
## [1] 21131
write.table(wide, file.path(ous_dir, "Feb22_RNAseq/03limma/rpkm_tmm_means.csv"), sep=",", row.names = F)
Fix the time factors, also adding a psuedotime where floating day15 samples are beyond bulk day15s
averages$time = factor(gsub("-",".",averages$time), levels=c(".2",0,1,3,9,15))
averages$pstime = as.character(averages$time)
averages$pstime[averages$separation == "floating"] = "15float"
averages$pstime = factor(averages$pstime, levels=c(".2",0,1,3,9,15, "15float"))
write.table(averages, file.path(ous_dir, "Feb22_RNAseq/03limma/plotting_mean_rpkm.tab"), sep="\t")
ggplot(averages[averages$gene_name == "PPARG",]) +
geom_line(aes(x=pstime, y=mean, group= donor, colour=donor))
ggplot(averages[averages$gene_name == "HOTAIR",]) +
geom_line(aes(x=pstime, y=mean, group= donor, colour=donor))
ggplot(averages[averages$gene_name == "FABP4",]) +
geom_line(aes(x=pstime, y=mean, group= donor, colour=donor))
ggplot(averages[averages$gene_name == "CEBPA",]) +
geom_line(aes(x=pstime, y=mean, group= donor, colour=donor))
ggplot(averages[averages$gene_name == "CIDEA",]) +
geom_line(aes(x=pstime, y=mean, group= donor, colour=donor))
## Load clusters and plot with the extra time points
This will give some initial indication as to whether cluster identity can in any way predict day15 expression.
oriclust = read.delim(file.path(ous_dir, "D13rnaseqDec19/clust/match&merge_feb20/clusters_11-02-2020.tab"),
header=T)
head(oriclust)
## gene gene_name
## 1 ENSG00000003249 DBNDD1
## 2 ENSG00000067715 SYT1
## 3 ENSG00000070808 CAMK2A
## 4 ENSG00000084628 NKAIN1
## 5 ENSG00000091137 SLC26A4
## 6 ENSG00000101638 ST8SIA5
## description
## 1 dysbindin domain containing 1
## 2 synaptotagmin 1
## 3 calcium/calmodulin dependent protein kinase II alpha
## 4 sodium/potassium transporting ATPase interacting 1
## 5 solute carrier family 26 member 4
## 6 ST8 alpha-N-acetyl-neuraminide alpha-2,8-sialyltransferase 5
## merged.cluster.glut merged.cluster.abdo genes.abdo Ave.Expr
## 1 0 1 1505 -2.4906993
## 2 0 1 1505 -3.0152061
## 3 0 1 1505 -2.8504277
## 4 0 1 1505 -1.7988766
## 5 0 1 1505 -0.7368722
## 6 0 1 1505 -2.4129353
## total_time_score adj.P.Val.time total_donor_effect adj.P.Val.donor
## 1 -7.998479 3.452301e-06 2.20357137 5.538658e-01
## 2 -13.510137 1.175280e-09 9.10389182 4.147443e-05
## 3 -9.301606 4.022025e-08 2.93110477 7.654837e-03
## 4 -5.570890 1.927477e-07 -0.16189159 1.359138e-04
## 5 -2.443903 6.648553e-04 -0.03461714 1.564329e-01
## 6 -5.820780 1.439061e-07 14.30018755 3.549713e-09
## cluster.glut cluster.abdo new.cluster.abdo
## 1 0 3 1
## 2 0 3 1
## 3 0 3 1
## 4 0 3 1
## 5 0 3 1
## 6 0 3 1
oriclust$merged.cluster.glut = factor(oriclust$merged.cluster.glut, levels = c(0:10))
oriclust$merged.cluster.abdo = factor(oriclust$merged.cluster.abdo, levels = c(0:16))
earclust = merge(averages, oriclust[c("gene","merged.cluster.glut","merged.cluster.abdo")])
earclust$gene.donor = paste(earclust$gene_name, sep=".", earclust$donor)
head(earclust)
## gene group time donor gene_name separation description
## 1 ENSG00000000005 1.D13-A.bulk 1 D13-A TNMD bulk tenomodulin
## 2 ENSG00000000005 0.D13-A.bulk 0 D13-A TNMD bulk tenomodulin
## 3 ENSG00000000005 -2.D07-G.bulk .2 D07-G TNMD bulk tenomodulin
## 4 ENSG00000000005 15.D13-A.bulk 15 D13-A TNMD bulk tenomodulin
## 5 ENSG00000000005 3.D07-G.bulk 3 D07-G TNMD bulk tenomodulin
## 6 ENSG00000000005 9.D07-G.bulk 9 D07-G TNMD bulk tenomodulin
## mean z_score sample pstime merged.cluster.glut
## 1 0.222041790 -0.4398654 day_1_D13.A_bulk 1 8
## 2 0.031155024 -0.6319792 day_0_D13.A_bulk 0 8
## 3 0.009140408 -0.6541353 day_.2_D07.G_bulk .2 8
## 4 1.947360246 1.2965432 day_15_D13.A_bulk 15 8
## 5 0.056442830 -0.6065288 day_3_D07.G_bulk 3 8
## 6 0.286571779 -0.3749206 day_9_D07.G_bulk 9 8
## merged.cluster.abdo gene.donor
## 1 8 TNMD.D13-A
## 2 8 TNMD.D13-A
## 3 8 TNMD.D07-G
## 4 8 TNMD.D13-A
## 5 8 TNMD.D07-G
## 6 8 TNMD.D07-G
In glut:
ggplot(earclust[earclust$donor == "D07-G",], aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15, "15float")),
alpha=0.2) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
facet_wrap(~merged.cluster.glut)
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$donor == "D07-G",], aes(x=pstime, y=z_score)) +
geom_boxplot(aes(group=pstime, fill=time == 15)) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
facet_wrap(~merged.cluster.glut)
## `geom_smooth()` using formula 'y ~ x'
Donor 7 analysis: For cluster 8 (adipogenesis) genes, most drop in expression from day9 to bulk day15. However, in floating cells cluster 8 genes remain highly expressed on average. There is a large variation in the expression of cluster8 genes in floating samples; indicating the
ggplot(earclust[earclust$donor == "D13-A",], aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15, "15float")),
alpha=0.2) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.abdo)) +
facet_wrap(~merged.cluster.abdo)
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$donor == "D13-A",], aes(x=pstime, y=z_score)) +
geom_boxplot(aes(group=pstime, fill=time == 15)) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.abdo)) +
facet_wrap(~merged.cluster.abdo)
## `geom_smooth()` using formula 'y ~ x'
Abdo custer 8 has the same trend, but it is less pronounced, as bulk and floating day15 expression patterns are more similar.
Overall I would say that the floating cells tend to continue the trend of the clusters, whilst bulk samples are more often have different gene expression between day9 and day15.
ggplot(earclust[earclust$merged.cluster.glut == 8,], aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15, "15float")),
alpha=0.05) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
scale_color_discrete(name="new data") +facet_wrap(~donor)
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$merged.cluster.glut == 1,], aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15, "15float")),
alpha=0.05) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
scale_color_discrete(name="new data") +facet_wrap(~donor)
## `geom_smooth()` using formula 'y ~ x'
Clearly significant genes in cluster 8 tend to decrease expr d9->bulk d15. And have a tendency to continue decreasing in floating…. though thats’ only bae on he loess curve, the underlying data is very dispersed.
ggplot(earclust[earclust$merged.cluster.glut == 8 & earclust$donor == "D07-G",],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15, "15float")),
alpha=0.05) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
scale_color_discrete(name="new data") +facet_wrap(~gene %in% rownames(result)[!result[,"d9_to_d15.D7.bulk"] == 0 ])
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$merged.cluster.abdo == 8 & earclust$donor == "D13-A",],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15, "15float")),
alpha=0.05) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.abdo)) +
scale_color_discrete(name="new data") + facet_wrap(~gene %in% rownames(result)[!result[,"d9_to_d15.D13.bulk"] == 0 ])
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$merged.cluster.glut == 1 & earclust$donor == "D07-G",],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15, "15float")),
alpha=0.1) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
scale_color_discrete(name="new data") +facet_wrap(~gene %in% rownames(result)[!result[,"d9_to_d15.D7.bulk"] == 0 ])
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$merged.cluster.glut == 1 & earclust$donor == "D07-G",],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15, "15float")),
alpha=0.1) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
scale_color_discrete(name="new data") +facet_wrap(~gene %in% rownames(result)[!result[,"d9_to_d15.D7.floating"] == 0 ])
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$merged.cluster.abdo == 1 & earclust$donor == "D13-A",],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15, "15float")),
alpha=0.1) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.abdo)) +
scale_color_discrete(name="new data") + facet_wrap(~gene %in% rownames(result)[!result[,"d9_to_d15.D13.bulk"] == 0 ])
## `geom_smooth()` using formula 'y ~ x'
The final state of cluster1 genes is less predictable from their dynamics prior to diff. onset. There is a big donor difference: in donor7 cluster one genes seems o be evenly split between being up/down regulated in both fractions or in just one. Whilst in D13 the trends are more mild.
If we can make a column containg the d15 state information: e.g. float_up, D7.float_up, D13.float_up, bulk_up, ect. then we can facet based on that column to get a more granular cluster view.
#head(result)
granule =result[,grep("d9_to_d15.*",colnames(result))]
granule[granule == 1] = "up"
granule[granule == -1] = "down"
granule = data.frame(granule)
for (i in 1:nrow(granule)){
if (granule$d9_to_d15.D7.floating[i] == granule$d9_to_d15.D13.floating[i] &
granule$d9_to_d15.D7.bulk[i] == granule$d9_to_d15.D13.bulk[i] &
granule$d9_to_d15.D7.floating[i] == granule$d9_to_d15.D13.bulk[i]){
granule$d15_status[i] = paste("day15", sep="_", granule$d9_to_d15.D7.floating[i])
#sig in bulk and floating
} else if (granule$d9_to_d15.D7.floating[i] == granule$d9_to_d15.D13.floating[i]){
granule$d15_status[i] = paste("float", sep="_", granule$d9_to_d15.D7.floating[i])
#sig in floating
} else if (granule$d9_to_d15.D7.bulk[i] == granule$d9_to_d15.D13.bulk[i]){
#sig in bulk
granule$d15_status[i] = paste("bulk", sep="_", granule$d9_to_d15.D7.bulk[i])
} else if (granule$d9_to_d15.D7.floating[i] == granule$d9_to_d15.D7.bulk[i]){
#sig in donor7
granule$d15_status[i] = paste("d7", sep="_", granule$d9_to_d15.D7.bulk[i])
} else if (granule$d9_to_d15.D13.floating[i] == granule$d9_to_d15.D13.bulk[i]){
#sig in donor13
granule$d15_status[i] = paste("d13", sep="_", granule$d9_to_d15.D13.bulk[i])
} else {
interesting = colnames(granule)
granule$d15_status[i] = paste(colnames(granule)[1:ncol(granule)-1][granule[i,1:(ncol(granule)-1)] != 0], collapse="_&_")
}
}
summary(as.factor(granule$d15_status))
## bulk_0
## 3147
## bulk_down
## 310
## bulk_up
## 578
## d13_0
## 60
## d13_down
## 4
## d13_up
## 3
## d7_0
## 396
## d7_down
## 394
## d7_up
## 586
## d9_to_d15.D7.bulk_&_d9_to_d15.D13.floating
## 87
## d9_to_d15.D7.floating_&_d9_to_d15.D13.bulk
## 325
## day15_0
## 8168
## day15_down
## 1726
## day15_up
## 886
## float_0
## 2515
## float_down
## 714
## float_up
## 1275
merging into plottng dataframe
granule$gene = rownames(granule)
earclust = merge(earclust, granule[c('gene','d15_status')])
head(earclust)
## gene group time donor gene_name separation description
## 1 ENSG00000000005 1.D13-A.bulk 1 D13-A TNMD bulk tenomodulin
## 2 ENSG00000000005 0.D13-A.bulk 0 D13-A TNMD bulk tenomodulin
## 3 ENSG00000000005 -2.D07-G.bulk .2 D07-G TNMD bulk tenomodulin
## 4 ENSG00000000005 15.D13-A.bulk 15 D13-A TNMD bulk tenomodulin
## 5 ENSG00000000005 3.D07-G.bulk 3 D07-G TNMD bulk tenomodulin
## 6 ENSG00000000005 9.D07-G.bulk 9 D07-G TNMD bulk tenomodulin
## mean z_score sample pstime merged.cluster.glut
## 1 0.222041790 -0.4398654 day_1_D13.A_bulk 1 8
## 2 0.031155024 -0.6319792 day_0_D13.A_bulk 0 8
## 3 0.009140408 -0.6541353 day_.2_D07.G_bulk .2 8
## 4 1.947360246 1.2965432 day_15_D13.A_bulk 15 8
## 5 0.056442830 -0.6065288 day_3_D07.G_bulk 3 8
## 6 0.286571779 -0.3749206 day_9_D07.G_bulk 9 8
## merged.cluster.abdo gene.donor d15_status
## 1 8 TNMD.D13-A day15_0
## 2 8 TNMD.D13-A day15_0
## 3 8 TNMD.D07-G day15_0
## 4 8 TNMD.D13-A day15_0
## 5 8 TNMD.D07-G day15_0
## 6 8 TNMD.D07-G day15_0
earclust$d15_status = factor(earclust$d15_status,
levels = c(
paste("day15",sep="_",c("up","down",0)),
paste("float", sep="_", c("up","down",0)),
paste("bulk", sep="_", c("up","down",0)),
paste("d7", sep="_", c("up","down",0)),
paste("d13", sep="_", c("up","down",0)),
"d9_to_d15.D7.bulk_&_d9_to_d15.D13.floating",
"d9_to_d15.D7.floating_&_d9_to_d15.D13.bulk"
))
summary(earclust$d15_status)
## day15_up
## 7042
## day15_down
## 8694
## day15_0
## 34916
## float_up
## 7518
## float_down
## 3990
## float_0
## 19880
## bulk_up
## 5782
## bulk_down
## 2016
## bulk_0
## 16506
## d7_up
## 2814
## d7_down
## 1890
## d7_0
## 2198
## d13_up
## 28
## d13_down
## 28
## d13_0
## 714
## d9_to_d15.D7.bulk_&_d9_to_d15.D13.floating
## 798
## d9_to_d15.D7.floating_&_d9_to_d15.D13.bulk
## 1372
earclust[is.na(earclust$d15_status),]
## [1] gene group time
## [4] donor gene_name separation
## [7] description mean z_score
## [10] sample pstime merged.cluster.glut
## [13] merged.cluster.abdo gene.donor d15_status
## <0 rows> (or 0-length row.names)
Plot cluster 8 split by d15 status
Excepting genes that don’t change in day9 to day15;
The largest groups of genes are those downregulated in both bulk and floating cells; and those up in floating cells. There’s also some genes downregulated in bulk only.
ggplot(earclust[earclust$merged.cluster.glut == 8 & earclust$donor == "D07-G",],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15)),
alpha=0.1) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
scale_color_discrete(name="new data") +facet_wrap(~d15_status)
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$merged.cluster.abdo == 8 & earclust$donor == "D13-A",],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15)),
alpha=0.1) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.abdo)) +
scale_color_discrete(name="new data") +facet_wrap(~d15_status)
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$merged.cluster.abdo == 8 & earclust$donor == "D13-A" &
grepl("^(day15|float|bulk)",earclust$d15_status),],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15)),
alpha=0.1) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.abdo)) +
scale_color_discrete(name="new data") +facet_wrap(~d15_status)
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$merged.cluster.glut == 8 & earclust$donor == "D07-G" &
grepl("^(day15|float|bulk)",earclust$d15_status),],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15)),
alpha=0.1) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
scale_color_discrete(name="new data") +facet_wrap(~d15_status) + theme_bw(base_size=20)
## `geom_smooth()` using formula 'y ~ x'
In cluster 1 the largest groups are upregulated
ggplot(earclust[earclust$merged.cluster.glut == 1 & earclust$donor == "D07-G",],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15)),
alpha=0.4) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
scale_color_discrete(name="new data") +facet_wrap(~d15_status)
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$merged.cluster.abdo == 1 & earclust$donor == "D13-A",],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15)),
alpha=0.4) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.abdo)) +
scale_color_discrete(name="new data") +facet_wrap(~d15_status)
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[earclust$merged.cluster.glut == 1 & earclust$donor == "D07-G" &
grepl("^(day15|float|bulk)",earclust$d15_status),],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=time %in% c(9, 15)),
alpha=0.1) +
geom_smooth(method=loess, colour="black", se=F, aes(group=merged.cluster.glut)) +
scale_color_discrete(name="new data") +facet_wrap(~d15_status) + theme_bw(base_size=20)
## `geom_smooth()` using formula 'y ~ x'
Excepting the largest clusters 1 and 8, below I look at the distribution of genes DE in adipocytes among RNAseq clusters from early differentiation.
Much like cluster1 genes, cluster 2 and 3 genes are well represented in genes that are upregulated at day15. Whilst genes that are downregulated at day15 are more likely to have been in cluster 6 or 7. Genes upregulated in floating cells are a mix of early differentiation clusters. Genes downregulated in floating cells likewise largely belong to clusters 6 and 7. Genes upregulated in bulk adipocytes likewise belong largely to clusters 2 and 3.
This makes some sense. Genes that were already decreasing expression by day9, tend to continue that trend by day 15. Genes that decreased expression early in differentiation are equally likely to regain expression in adipocytes, particularly in the bulk fraction where a more heterogenous population of cells exists. Gene expression levels needed to drive adipogenesis are different tha those require to maintain mature adipocyte state.
ggplot(earclust[!earclust$merged.cluster.abdo %in% c(1,8) & earclust$donor == "D13-A",],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=merged.cluster.abdo),
alpha=0.2) +
geom_smooth(method=loess, colour="black", se=F, aes(group=d15_status)) +
scale_color_discrete(name="new data") +facet_wrap(~d15_status) +
guides(colour = guide_legend(override.aes = list(alpha = 1))) + theme_classic(base_size=20)
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[!earclust$merged.cluster.abdo %in% c(1,8) & earclust$donor == "D13-A" &
earclust$d15_status %in% c(paste("day15",sep="_",c("up","down")),
paste("float", sep="_", c("up","down")),
paste("bulk", sep="_", c("up","down"))),],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=merged.cluster.abdo),
alpha=0.4) +
geom_smooth(method=loess, colour="black", se=F, aes(group=d15_status)) +
scale_color_discrete(name="new data") +facet_wrap(~d15_status) +
guides(colour = guide_legend(override.aes = list(alpha = 1))) + theme_classic(base_size = 20)
## `geom_smooth()` using formula 'y ~ x'
ggplot(earclust[!earclust$merged.cluster.glut %in% c(1,8) & earclust$donor == "D07-G" &
earclust$d15_status %in% c(paste("day15",sep="_",c("up","down")),
paste("float", sep="_", c("up","down")),
paste("bulk", sep="_", c("up","down"))),],
aes(x=pstime, y=z_score)) +
geom_line(aes(group=gene.donor, colour=merged.cluster.glut),
alpha=0.4) +
geom_smooth(method=loess, colour="black", se=F, aes(group=d15_status)) +
scale_color_discrete(name="new data") +facet_wrap(~d15_status) +
guides(colour = guide_legend(override.aes = list(alpha = 1))) + theme_classic(base_size = 20)
## `geom_smooth()` using formula 'y ~ x'
##saveearclust here
##Other than reclustering, we can visualise those DE day9-> day15
#clusters made on previous sig genes (minus the donor specific genes)
early_sig = unique(earclust$gene)
length(early_sig) #8299
## [1] 8299
#sig genes with extra times
all_sig = unique(comb$Geneid[comb$adj.P.Val < 0.01])
length(all_sig) #11639
## [1] 11639
summary(early_sig %in% all_sig) #2529 genes are dropped when day15 included
## Mode FALSE TRUE
## logical 2529 5770
summary(all_sig %in% early_sig) #5869 new genes are different n the day15 samples
## Mode FALSE TRUE
## logical 5869 5770
how many day15 sig genes are donor-specific? (as in not expressed in the other donor). Just 1! that’s incredible
dropouts = group_by(averages, donor, gene) %>% filter(mean == 0) %>% count()
summary(dropouts$n >6)
## Mode FALSE TRUE
## logical 3153 3
summary(dropouts$gene %in% all_sig & dropouts$n > 6)
## Mode FALSE TRUE
## logical 3155 1
summary(dropouts$gene %in% early_sig & dropouts$n > 6)
## Mode FALSE
## logical 3156
bulk = topTable(pair_fit, coef = c("d9_to_d15.D7.bulk", "d9_to_d15.D13.bulk"), number = nrow(pair_fit$genes), p.value = 0.01)
summary(bulk$d9_to_d15.D7.bulk * bulk$d9_to_d15.D13.bulk < 0)
## Mode FALSE TRUE
## logical 8413 472
bulk = bulk[bulk$d9_to_d15.D7.bulk * bulk$d9_to_d15.D13.bulk > 0,] #sig genes must have the same sign
head(bulk); nrow(bulk) #8k genes diff between day9 and 15 :o
## Geneid Length gene_name
## ENSG00000214199 ENSG00000214199 1346 EEF1A1P12
## ENSG00000236972 ENSG00000236972 411 FABP5P1
## ENSG00000249855 ENSG00000249855 1382 EEF1A1P19
## ENSG00000259308 ENSG00000259308 404 AC024270.1
## ENSG00000230383 ENSG00000230383 862 AC009245.1
## ENSG00000236044 ENSG00000236044 408 FABP5P2
## description
## ENSG00000214199 eukaryotic translation elongation factor 1 alpha 1 pseudogene 12
## ENSG00000236972 fatty acid binding protein 5 pseudogene 1
## ENSG00000249855 eukaryotic translation elongation factor 1 alpha 1 pseudogene 19
## ENSG00000259308 fatty acid binding protein 5 (psoriasis-associated) (FABP5) pseudogene
## ENSG00000230383 ribosomal protein L6 (RPL6) pseudogene
## ENSG00000236044 fatty acid binding protein 5 pseudogene 2
## d9_to_d15.D7.bulk d9_to_d15.D13.bulk AveExpr F
## ENSG00000214199 -7.303966 -7.095949 5.1116184 394.7706
## ENSG00000236972 -5.867698 -5.688479 1.6957432 340.9487
## ENSG00000249855 -4.249684 -4.265037 6.9259314 302.0047
## ENSG00000259308 -8.622021 -7.743733 -2.6290392 299.3378
## ENSG00000230383 -4.584224 -5.761866 5.0965401 289.0667
## ENSG00000236044 -6.926677 -5.769900 0.8331876 279.7052
## P.Value adj.P.Val
## ENSG00000214199 2.771171e-26 5.867678e-22
## ENSG00000236972 3.504469e-25 3.710181e-21
## ENSG00000249855 3.570235e-24 1.944460e-20
## ENSG00000259308 3.673297e-24 1.944460e-20
## ENSG00000230383 7.845518e-24 3.322420e-20
## ENSG00000236044 1.314769e-23 4.639821e-20
## [1] 8413